In [1]:
import plotly.tools as tls
tls.embed('https://plot.ly/~chris/7365')


Out[1]:

In [2]:
import pandas as pd
from sqlalchemy import create_engine #database connection
import datetime as dt
from IPython.display import display

import plotly.plotly as py #interactive graphing
from plotly.graph_objs import Bar, Scatter, Marker, Layout

In [3]:
import zipfile
import requests
import io
import random
import seaborn as sns
import numpy as np
from datetime import datetime
%matplotlib inline

In [ ]:

Decided to switch to just playing with Citibike data for now.

In [6]:
r = requests.get('https://s3.amazonaws.com/tripdata/201606-citibike-tripdata.zip')
with zipfile.ZipFile(io.BytesIO(r.content)) as ar:
    trip_data = pd.read_csv(ar.open('201606-citibike-tripdata.csv'))

In [4]:


In [7]:
trip_data.head(5)


Out[7]:
tripduration starttime stoptime start station id start station name start station latitude start station longitude end station id end station name end station latitude end station longitude bikeid usertype birth year gender
0 1470 6/1/2016 00:00:18 6/1/2016 00:24:48 380 W 4 St & 7 Ave S 40.734011 -74.002939 3236 W 42 St & Dyer Ave 40.758985 -73.993800 19859 Subscriber 1972.0 1
1 229 6/1/2016 00:00:20 6/1/2016 00:04:09 3092 Berry St & N 8 St 40.719009 -73.958525 3103 N 11 St & Wythe Ave 40.721533 -73.957824 16233 Subscriber 1967.0 1
2 344 6/1/2016 00:00:21 6/1/2016 00:06:06 449 W 52 St & 9 Ave 40.764618 -73.987895 469 Broadway & W 53 St 40.763441 -73.982681 22397 Subscriber 1989.0 1
3 1120 6/1/2016 00:00:28 6/1/2016 00:19:09 522 E 51 St & Lexington Ave 40.757148 -73.972078 401 Allen St & Rivington St 40.720196 -73.989978 16231 Subscriber 1991.0 1
4 229 6/1/2016 00:00:53 6/1/2016 00:04:42 335 Washington Pl & Broadway 40.729039 -73.994046 285 Broadway & E 14 St 40.734546 -73.990741 15400 Subscriber 1989.0 1

In [8]:
len(trip_data['bikeid'].unique())


Out[8]:
7882

In [9]:
len(trip_data['start station name'].unique())


Out[9]:
475

In [10]:
trip_data.describe()


Out[10]:
tripduration start station id start station latitude start station longitude end station id end station latitude end station longitude bikeid birth year gender
count 1.460318e+06 1.460318e+06 1.460318e+06 1.460318e+06 1.460318e+06 1.460318e+06 1.460318e+06 1.460318e+06 1.265112e+06 1.460318e+06
mean 9.897580e+02 9.562393e+02 4.073736e+01 -7.398694e+01 9.457872e+02 4.073696e+01 -7.398677e+01 2.051037e+04 1.977659e+03 1.079696e+00
std 7.589179e+03 1.101579e+03 2.277217e-02 1.641347e-02 1.093657e+03 7.874038e-02 1.378920e-01 3.454210e+03 1.157717e+01 5.872326e-01
min 6.100000e+01 7.200000e+01 4.067891e+01 -7.401713e+01 7.200000e+01 0.000000e+00 -7.408364e+01 1.452900e+04 1.885000e+03 0.000000e+00
25% 4.050000e+02 3.280000e+02 4.072080e+01 -7.399906e+01 3.280000e+02 4.072066e+01 -7.399915e+01 1.748200e+04 1.970000e+03 1.000000e+00
50% 6.760000e+02 4.530000e+02 4.073818e+01 -7.398890e+01 4.500000e+02 4.073782e+01 -7.398890e+01 2.052900e+04 1.980000e+03 1.000000e+00
75% 1.143000e+03 5.300000e+02 4.075383e+01 -7.397769e+01 5.290000e+02 4.075300e+01 -7.397771e+01 2.362800e+04 1.987000e+03 1.000000e+00
max 3.129291e+06 3.260000e+03 4.078721e+01 -7.392989e+01 3.260000e+03 4.078721e+01 0.000000e+00 2.603700e+04 2.000000e+03 2.000000e+00

In [11]:
trip_data[trip_data['gender'] > 0].describe()


Out[11]:
tripduration start station id start station latitude start station longitude end station id end station latitude end station longitude bikeid birth year gender
count 1.262082e+06 1.262082e+06 1.262082e+06 1.262082e+06 1.262082e+06 1.262082e+06 1.262082e+06 1.262082e+06 1.262000e+06 1.262082e+06
mean 8.299706e+02 9.265478e+02 4.073735e+01 -7.398685e+01 9.140904e+02 4.073700e+01 -7.398684e+01 2.059188e+04 1.977671e+03 1.249285e+00
std 4.885603e+03 1.081278e+03 2.215324e-02 1.618760e-02 1.071076e+03 5.584185e-02 9.454664e-02 3.461173e+03 1.155104e+01 4.325992e-01
min 6.100000e+01 7.200000e+01 4.067891e+01 -7.401713e+01 7.200000e+01 0.000000e+00 -7.405570e+01 1.452900e+04 1.885000e+03 1.000000e+00
25% 3.800000e+02 3.280000e+02 4.072205e+01 -7.399810e+01 3.270000e+02 4.072179e+01 -7.399852e+01 1.755400e+04 1.970000e+03 1.000000e+00
50% 6.160000e+02 4.500000e+02 4.073827e+01 -7.398889e+01 4.490000e+02 4.073782e+01 -7.398890e+01 2.069300e+04 1.980000e+03 1.000000e+00
75% 1.013000e+03 5.270000e+02 4.075207e+01 -7.397771e+01 5.260000e+02 4.075187e+01 -7.397771e+01 2.374100e+04 1.987000e+03 1.000000e+00
max 2.167999e+06 3.260000e+03 4.078721e+01 -7.392989e+01 3.260000e+03 4.078721e+01 0.000000e+00 2.603700e+04 2.000000e+03 2.000000e+00

In [14]:
trip_data.starttime = trip_data.starttime.map(lambda x: np.datetime64(
        datetime.strptime(x,"%m/%d/%Y %H:%M:%S")))
trip_data.stoptime = trip_data.stoptime.map(lambda x: np.datetime64(
        datetime.strptime(x,"%m/%d/%Y %H:%M:%S")))

In [15]:
trip_data.dtypes


Out[15]:
tripduration                        int64
starttime                  datetime64[ns]
stoptime                   datetime64[ns]
start station id                    int64
start station name                 object
start station latitude            float64
start station longitude           float64
end station id                      int64
end station name                   object
end station latitude              float64
end station longitude             float64
bikeid                              int64
usertype                           object
birth year                        float64
gender                              int64
dtype: object

Sampling of random bike-weeks


In [17]:
random_bikes = random.sample(list(trip_data['bikeid'].unique()),10)

In [18]:
sample_trips = pd.DataFrame(columns=trip_data.columns)
for day, bike in zip([6]*7,random_bikes):
    selected_trips = trip_data[(trip_data['starttime'] < datetime(2016,3,day + 7) & (selected_trips[selected_trips['stoptime'] >= datetime(2016,3, day) &
                                (selected_trips = selected_trips[selected_trips['bikeid'] == bike])]

    sample_trips = sample_trips.append(selected_trips)


  File "<ipython-input-18-47a41cdcf0bd>", line 4
    (selected_trips = selected_trips[selected_trips['bikeid'] == bike])]
                    ^
SyntaxError: invalid syntax

In [ ]:
print (sample_trips)

In [ ]: